Import Packages
library(ggplot2)
library(plotly)
library(data.table)
Import Data and look at the first couple of records
# Load the competition CSV files with data.table::fread() and preview the
# first two training rows.
# NOTE(review): setwd() points at a machine-specific absolute path; the
# relative "data/..." paths below resolve against it.
setwd("F:/kaggle/Mercedes-Benz")
sample_submission <- fread("data/sample_submission.csv")
train <- fread("data/train.csv")
test <- fread("data/test.csv")
train[1:2,]
# Column layout of train (by position):
# 1 - ID
# 2 - Response Variable (Time in Seconds on Test Stand)
# 3-10 String Option Codes
# 11-385 0/1 values based on option codes
# X7 and X9 are missing for some reason.
Calculate Mean and SD for Response Variable (Y). This is the time (sec) for a vehicle on the MB test station.
# Average of the response column y.
mean(train[["y"]])
[1] 100.6693
# Standard deviation of the response column y.
sd(train[["y"]])
[1] 12.67938
Create plots for the Response Variable. Sorted by ID.
# Response values plotted in the row order of the training file.
plot_ly(
  y = train[["y"]],
  type = "scatter"
)
Create plots for the Response Variable. Sorted by the Response Variable.
# Response values plotted after sorting the rows by the response itself.
plot_ly(
  y = train[order(y), y],
  type = "scatter"
)
Create histogram for the Response Variable.
# Distribution of the raw response values.
plot_ly(
  x = train[["y"]],
  type = "histogram"
)
Create histogram for the LOG(Response Variable).
# Distribution of the natural log of the response values.
plot_ly(
  x = log(train[["y"]]),
  type = "histogram"
)
Frequency tables for the string-coded option variables in columns 3-10 (X0-X6 and X8)
# Row count and mean response (1 decimal) per level of each string-coded
# option column, most frequent level first. Count and mean come from a single
# grouped query, so both values always belong to the same group instead of
# being two separate by= results glued together positionally with cbind().
train[, .(N = .N, Mean_Y = round(mean(y), 1)), by = X0][order(-N)]
train[, .(N = .N, Mean_Y = round(mean(y), 1)), by = X1][order(-N)]
train[, .(N = .N, Mean_Y = round(mean(y), 1)), by = X2][order(-N)]
train[, .(N = .N, Mean_Y = round(mean(y), 1)), by = X3][order(-N)]
train[, .(N = .N, Mean_Y = round(mean(y), 1)), by = X4][order(-N)]
train[, .(N = .N, Mean_Y = round(mean(y), 1)), by = X5][order(-N)]
train[, .(N = .N, Mean_Y = round(mean(y), 1)), by = X6][order(-N)]
train[, .(N = .N, Mean_Y = round(mean(y), 1)), by = X8][order(-N)]
# No tables for X7 and X9: the notes in the import section record those
# columns as missing from the data.
Frequency plots for the string-coded option variables in columns 3-10 (X0-X6 and X8)
# Bar chart of level frequencies for each string-coded option column.
# The list of columns to plot was never defined in this notebook, so a fresh
# run failed with "object not found". The import notes place the string option
# codes in columns 3-10 of train, so the list is built from those positions.
column_names_for_option_plots_string_codes <- colnames(train)[3:10]

l <- htmltools::tagList()
# seq_along() keeps the loop from running when the list is empty
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(column_names_for_option_plots_string_codes)) {
  plot_variable <- column_names_for_option_plots_string_codes[i]
  # Count rows per level once, most frequent level first, and reuse the result
  # for both axes.
  level_counts <- as.data.frame(train[, .N, by = plot_variable][order(-N)])
  x_values <- level_counts[, plot_variable]
  y_values <- level_counts[, "N"]
  l[[i]] <- plot_ly(x = as.list(x_values), y = as.list(y_values), type = "bar") %>%
    layout(title = plot_variable)
}
l
Frequency table for the remaining variables (All with 0/1 Coding)
# Summary of the 0/1 option columns (everything after the first 10 columns):
# for each column, the number of rows with value 0 and with value 1, the mean
# response (rounded to 1 decimal) for each value, and the difference in means.
column_names_for_summary <- colnames(train)[-(1:10)]

# Count the rows where `column` equals `value`. Matching on the value itself
# matters: a by= query returns groups in order of first appearance in the data,
# so taking "row 1" as the 0 group and "row 2" as the 1 group swaps the
# statistics whenever the first row of the data holds a 1.
count_rows_with_value <- function(column, value) {
  sum(train[[column]] == value, na.rm = TRUE)
}

# Mean response, rounded to 1 decimal, over the rows where `column` equals
# `value`. Returns NA when no row has that value.
mean_y_with_value <- function(column, value) {
  rows <- which(train[[column]] == value)
  if (length(rows) == 0L) {
    return(NA_real_)
  }
  round(mean(train[["y"]][rows]), 1)
}

# These stay as top-level vectors (same names as before) because the later
# sorting step in this notebook reads N_1 directly. A value that never occurs
# in a column now has a count of 0 rather than NA.
Variable <- column_names_for_summary
N_0 <- vapply(Variable, count_rows_with_value, integer(1),
              value = 0, USE.NAMES = FALSE)
N_1 <- vapply(Variable, count_rows_with_value, integer(1),
              value = 1, USE.NAMES = FALSE)
Mean_0 <- vapply(Variable, mean_y_with_value, numeric(1),
                 value = 0, USE.NAMES = FALSE)
Mean_1 <- vapply(Variable, mean_y_with_value, numeric(1),
                 value = 1, USE.NAMES = FALSE)

# Build the table directly with typed columns; no character matrix round trip.
summary_results <- data.frame(
  Variable = Variable,
  N_0 = N_0,
  N_1 = N_1,
  Mean_0 = Mean_0,
  Mean_1 = Mean_1,
  stringsAsFactors = FALSE
)
# Difference of the rounded means, as before.
summary_results$Delta_Mean <- summary_results$Mean_1 - summary_results$Mean_0
summary_results
After looking at Kaggle, checked for duplicate fields - added a duplicate flag to the frequency table for the remaining variables (all with 0/1 coding).
# Flag columns whose values exactly repeat an earlier column.
# Idea from raddar@Kaggle => https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34006
# t() turns every column into a row, so duplicated() compares whole columns.
# The result has one TRUE/FALSE per column of train; compute it once and reuse.
is_duplicate_column <- duplicated(t(train))

# with = FALSE makes data.table treat j as a column selector. Without it,
# train[, !duplicated(...)] evaluates j as an expression and returns the
# logical vector itself instead of the de-duplicated table.
train_2 <- train[, which(!is_duplicate_column), with = FALSE]

# Drop the flags of the first 10 (non-binary) columns so the remaining flags
# line up one-to-one with the rows of summary_results.
duplicate_column <- is_duplicate_column[-(1:10)]
summary_results$duplicate_column <- duplicate_column

# Sort on the table's own N_1 column rather than a same-named global vector.
summary_results_decreasing <- summary_results[order(-summary_results$N_1), ]

summary_results
summary_results_decreasing
List of Duplicated Columns
# Keep only the rows whose column was flagged as a duplicate.
flagged_rows <- summary_results_decreasing$duplicate_column
summary_results_decreasing[flagged_rows, ]
Create Response Plots for all Binary Options
# Box plot of the response against each remaining 0/1 option column.
train_df <- as.data.frame(train)
# Remove duplicated fields ... from raddar@Kaggle => https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34006
# drop = FALSE keeps a data.frame even if only one column survives.
train_df_dedupe <- train_df[, !duplicated(t(train_df)), drop = FALSE]

# Select the binary columns by name: everything that is not one of the first
# 10 columns of the original table. A positional 11:length() slice would shift
# if any of those leading columns had been dropped as a duplicate.
column_names_for_binary_plots <- setdiff(
  colnames(train_df_dedupe),
  head(colnames(train_df), 10)
)

l <- htmltools::tagList()
# seq_along() keeps the loop from running when there is nothing to plot.
for (i in seq_along(column_names_for_binary_plots)) {
  plot_variable <- column_names_for_binary_plots[i]
  l[[i]] <- plot_ly(
    x = train_df_dedupe[[plot_variable]],
    y = train_df_dedupe$y,
    type = "box",
    height = 300
  ) %>%
    layout(title = plot_variable)
}
l
LS0tDQp0aXRsZTogIk1lcmNlZGVzLUJlbnogRURBIE5vdGVib29rIg0KYXV0aG9yOiAiSmVmZiBIZWRiZXJnIg0KZGF0ZTogIjUtSnVuZS0yMDE3Ig0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOiBkZWZhdWx0DQogIGh0bWxfZG9jdW1lbnQ6IGRlZmF1bHQNCi0tLQ0KPGJyPg0KDQojIyMjIEltcG9ydCBQYWNrYWdlcw0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeShwbG90bHkpDQpsaWJyYXJ5KGRhdGEudGFibGUpDQpgYGANCg0KPGJyPg0KDQojIyMjICBJbXBvcnQgRGF0YSBhbmQgbG9vayBhdCBmaXJzdCBjb3VwbGUgcmVjb3Jkcw0KYGBge3J9DQpzZXR3ZCgiRjova2FnZ2xlL01lcmNlZGVzLUJlbnoiKQ0KDQpzYW1wbGVfc3VibWlzc2lvbiA8LSBmcmVhZCgiZGF0YS9zYW1wbGVfc3VibWlzc2lvbi5jc3YiKQ0KdHJhaW4gPC0gZnJlYWQoImRhdGEvdHJhaW4uY3N2IikNCnRlc3QgPC0gZnJlYWQoImRhdGEvdGVzdC5jc3YiKQ0KDQp0cmFpblsxOjIsXQ0KDQojIDEgLSBJRA0KIyAyIC0gUmVzcG9uc2UgVmFydWFibGUgKFRpbWUgaW4gU2Vjb25kcyBvbiBUZXN0IFN0YW5kKQ0KIyAzLTEwICBTdHJpbmcgT3B0aW9uIENvZGVzDQojIDExLTM4NSAwLzEgdmFsdWVzIGJhc2VkIG9uIG9wdGlvbiBjb2Rlcw0KDQojIFg3IGFuZCBYOSBhcmUgbWlzc2luZiBmb3Igc29tZSByZWFzb24uDQpgYGANCjxicj4NCg0KIyMjIyAgQ2FsY3VsYXRlIE1lYW4gYW5kIFNEIGZvciBSZXNwb25zZSBWYXJpYWJsZSAoWSkuICBUaGlzIGlzIHRoZSB0aW1lIChzZWMpIGZvciBhIHZlaGljbGUgb24gdGhlIE1CIHRlc3Qgc3RhdGlvbi4NCmBgYHtyfQ0KbWVhbih0cmFpblsseV0pDQpzZCh0cmFpblsseV0pDQpgYGANCjxicj4NCg0KIyMjIyAgQ3JlYXRlIHBsb3RzIGZvciB0aGUgUmVzcG9uc2UgVmFyaWFibGUuICBTb3J0ZWQgYnkgSUQuDQpgYGB7ciwgbWVzc2FnZT1GQUxTRX0NCnBsb3RfbHkoeT10cmFpbiR5LCB0eXBlPSJzY2F0dGVyIikNCmBgYA0KPGJyPg0KDQojIyMjICBDcmVhdGUgcGxvdHMgZm9yIHRoZSBSZXNwb25zZSBWYXJpYWJsZS4gIFNvcnRlZCBieSB0aGUgUmVzcG9uc2UgVmFyaWFibGUuDQpgYGB7ciwgbWVzc2FnZT1GQUxTRX0NCnBsb3RfbHkoeT10cmFpbltvcmRlcih5KSxdJHksIHR5cGU9InNjYXR0ZXIiKQ0KYGBgDQo8YnI+DQoNCiMjIyMgIENyZWF0ZSBoaXN0b2dyYW0gZm9yIHRoZSBSZXNwb25zZSBWYXJpYWJsZS4NCmBgYHtyfQ0KIHBsb3RfbHkoeD10cmFpbiR5LCB0eXBlPSJoaXN0b2dyYW0iKQ0KYGBgDQo8YnI+DQoNCiMjIyMgIENyZWF0ZSBoaXN0b2dyYW0gZm9yIHRoZSBMT0coUmVzcG9uc2UgVmFyaWFibGUpLg0KYGBge3J9DQogcGxvdF9seSh4PWxvZyh0cmFpbiR5KSwgdHlwZT0iaGlzdG9ncmFtIikNCmBgYA0KPGJyPg0KDQojIyMjICBGcmVxdWVuY3kgdGFibGVzIGZvciB0aGUgZmlyc3QgMTAgdmFyaWFibGVzIChBbGwgd2l0aCBTdHJpbmcgQ29kZXMpDQpg
YGB7cn0NCmNiaW5kKHRyYWluWywuTixieT1YMF0sTWVhbl9ZPXJvdW5kKHRyYWluWyxtZWFuKHkpLGJ5PVgwXSRWMSwxKSlbb3JkZXIoLU4pXQ0KY2JpbmQodHJhaW5bLC5OLGJ5PVgxXSxNZWFuX1k9cm91bmQodHJhaW5bLG1lYW4oeSksYnk9WDFdJFYxLDEpKVtvcmRlcigtTildDQpjYmluZCh0cmFpblssLk4sYnk9WDJdLE1lYW5fWT1yb3VuZCh0cmFpblssbWVhbih5KSxieT1YMl0kVjEsMSkpW29yZGVyKC1OKV0NCmNiaW5kKHRyYWluWywuTixieT1YM10sTWVhbl9ZPXJvdW5kKHRyYWluWyxtZWFuKHkpLGJ5PVgzXSRWMSwxKSlbb3JkZXIoLU4pXQ0KY2JpbmQodHJhaW5bLC5OLGJ5PVg0XSxNZWFuX1k9cm91bmQodHJhaW5bLG1lYW4oeSksYnk9WDRdJFYxLDEpKVtvcmRlcigtTildDQpjYmluZCh0cmFpblssLk4sYnk9WDVdLE1lYW5fWT1yb3VuZCh0cmFpblssbWVhbih5KSxieT1YNV0kVjEsMSkpW29yZGVyKC1OKV0NCmNiaW5kKHRyYWluWywuTixieT1YNl0sTWVhbl9ZPXJvdW5kKHRyYWluWyxtZWFuKHkpLGJ5PVg2XSRWMSwxKSlbb3JkZXIoLU4pXQ0KY2JpbmQodHJhaW5bLC5OLGJ5PVg4XSxNZWFuX1k9cm91bmQodHJhaW5bLG1lYW4oeSksYnk9WDhdJFYxLDEpKVtvcmRlcigtTildI3doZXJlIGlzIDcNCiN3aGVyZSBpcyA5DQoNCmBgYA0KPGJyPg0KDQojIyMjICBGcmVxdWVuY3kgcGxvdHMgZm9yIHRoZSBmaXJzdCAxMCB2YXJpYWJsZXMgKEFsbCB3aXRoIFN0cmluZyBDb2RlcykNCmBgYHtyfQ0KbCA8LSBodG1sdG9vbHM6OnRhZ0xpc3QoKQ0KZm9yIChpIGluIDE6bGVuZ3RoKGNvbHVtbl9uYW1lc19mb3Jfb3B0aW9uX3Bsb3RzX3N0cmluZ19jb2RlcykpIHsgICMNCiAgcGxvdF92YXJpYWJsZSA8LSBjb2x1bW5fbmFtZXNfZm9yX29wdGlvbl9wbG90c19zdHJpbmdfY29kZXNbaV0NCiAgeF92YWx1ZXMgPC0gYXMuZGF0YS5mcmFtZSh0cmFpblssLk4sYnk9cGxvdF92YXJpYWJsZV1bb3JkZXIoLU4pXSlbLHBsb3RfdmFyaWFibGVdDQogIHlfdmFsdWVzIDwtIGFzLmRhdGEuZnJhbWUodHJhaW5bLC5OLGJ5PXBsb3RfdmFyaWFibGVdW29yZGVyKC1OKV0pWywnTiddDQogIGxbW2ldXSA8LSBwbG90X2x5KHg9YXMubGlzdCh4X3ZhbHVlcykseT1hcy5saXN0KHlfdmFsdWVzKSwgdHlwZT0iYmFyIikgJT4lIGxheW91dCh0aXRsZT1wbG90X3ZhcmlhYmxlKQ0KfQ0KbA0KYGBgDQo8YnI+DQoNCiMjIyMgIEZyZXF1ZW5jeSB0YWJsZSBmb3IgdGhlIHJlbWFpbmluZyB2YXJpYWJsZXMgKEFsbCB3aXRoIDAvMSBDb2RpbmcpDQpgYGB7cn0NCmNvbHVtbl9uYW1lc19mb3Jfc3VtbWFyeSA8LSBjb2xuYW1lcyh0cmFpbilbMTE6bGVuZ3RoKGNvbG5hbWVzKHRyYWluKSldDQpWYXJpYWJsZSA8LSBjKCkNCk5fMCA8LSBjKCkNCk5fMSA8LSBjKCkNCk1lYW5fMCA8LSBjKCkNCk1lYW5fMSA8LSBjKCkNCg0KZm9yKGkgaW4gY29sdW1uX25hbWVzX2Zvcl9zdW1tYXJ5KXsNCiAgajwtMQ0KICBWYXJpYWJsZSA8LSBjKFZhcmlhYmxlLGkpDQogIE5fMCA8
LSBjKE5fMCx0cmFpblssLk4sYnk9aV1bMV0kTikNCiAgTl8xIDwtIGMoTl8xLHRyYWluWywuTixieT1pXVsyXSROKQ0KICBNZWFuXzAgPC0gYyhNZWFuXzAscm91bmQodHJhaW5bLG1lYW4oeSksYnk9aV1bMV0kVjEsMSkpDQogIE1lYW5fMSA8LSBjKE1lYW5fMSxyb3VuZCh0cmFpblssbWVhbih5KSxieT1pXVsyXSRWMSwxKSkNCiAgaiA8LSBqKzENCn0NCg0KIyBOXzBbaXMubmEoTl8wKT09VFJVRV0gPC0gMA0KIyBOXzFbaXMubmEoTl8xKT09VFJVRV0gPC0gMA0KIyBNZWFuXzBbaXMubmEoTWVhbl8wKT09VFJVRV0gPC0gMA0KIyBNZWFuXzFbaXMubmEoTWVhbl8xKT09VFJVRV0gPC0gMA0KDQpzdW1tYXJ5X3Jlc3VsdHMgPC0gYXMuZGF0YS5mcmFtZShjYmluZChWYXJpYWJsZSxOXzA9Tl8wLE5fMSxNZWFuXzAsTWVhbl8xKSwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFKQ0Kc3VtbWFyeV9yZXN1bHRzJE5fMCA8LSBhcy5pbnRlZ2VyKHN1bW1hcnlfcmVzdWx0cyROXzApDQpzdW1tYXJ5X3Jlc3VsdHMkTl8xIDwtIGFzLmludGVnZXIoc3VtbWFyeV9yZXN1bHRzJE5fMSkNCnN1bW1hcnlfcmVzdWx0cyRNZWFuXzAgPC0gYXMubnVtZXJpYyhzdW1tYXJ5X3Jlc3VsdHMkTWVhbl8wKQ0Kc3VtbWFyeV9yZXN1bHRzJE1lYW5fMSA8LSBhcy5udW1lcmljKHN1bW1hcnlfcmVzdWx0cyRNZWFuXzEpDQpzdW1tYXJ5X3Jlc3VsdHMkRGVsdGFfTWVhbiA8LSBzdW1tYXJ5X3Jlc3VsdHMkTWVhbl8xIC0gc3VtbWFyeV9yZXN1bHRzJE1lYW5fMA0Kc3VtbWFyeV9yZXN1bHRzDQpgYGANCjxicj4NCg0KIyMjIyAgQWZ0ZXIgTG9va2luZyBhdCBLYWdnbGUsIGNoZWNrZWQgZm9yIER1cGxpY2F0ZSBmaWxlZHMgLSBBZGRlZCBEYXRhIHRvIEZyZXF1ZW5jeSB0YWJsZSBmb3IgdGhlIHJlbWFpbmluZyB2YXJpYWJsZXMgKEFsbCB3aXRoIDAvMSBDb2RpbmcpLg0KYGBge3J9DQp0cmFpbl8yIDwtIHRyYWluWywgIWR1cGxpY2F0ZWQodCh0cmFpbikpXSAjcmVtb3ZlIGR1cGxpY2F0ZWQgZmllbGRzIC4uLiBmcm9tIHJhZGRhckBLYWdnbGUgPT4gaHR0cHM6Ly93d3cua2FnZ2xlLmNvbS9jL21lcmNlZGVzLWJlbnotZ3JlZW5lci1tYW51ZmFjdHVyaW5nL2Rpc2N1c3Npb24vMzQwMDYNCmR1cGxpY2F0ZV9jb2x1bW4gPC0gYygpDQpmb3IoaSBpbiBkdXBsaWNhdGVkKHQodHJhaW4pKSl7DQogIGR1cGxpY2F0ZV9jb2x1bW4gPC0gYyhkdXBsaWNhdGVfY29sdW1uLGkpDQp9DQoNCmR1cGxpY2F0ZV9jb2x1bW4gPC0gZHVwbGljYXRlX2NvbHVtblsxMTpsZW5ndGgoZHVwbGljYXRlX2NvbHVtbildDQoNCnN1bW1hcnlfcmVzdWx0cyRkdXBsaWNhdGVfY29sdW1uIDwtIGR1cGxpY2F0ZV9jb2x1bW4NCg0Kc3VtbWFyeV9yZXN1bHRzX2RlY3JlYXNpbmcgPC0gc3VtbWFyeV9yZXN1bHRzW29yZGVyKC1OXzEpLF0NCg0Kc3VtbWFyeV9yZXN1bHRzDQpzdW1tYXJ5X3Jlc3VsdHNfZGVjcmVhc2luZw0KYGBgDQo8YnI+DQoNCiMjIyMgIExpc3Qgb2YgRHVwbGljYXRlZCBDb2x1bW5zDQpg
YGB7cn0NCnN1bW1hcnlfcmVzdWx0c19kZWNyZWFzaW5nWyhzdW1tYXJ5X3Jlc3VsdHNfZGVjcmVhc2luZyRkdXBsaWNhdGVfY29sdW1uPT1UUlVFKSxdDQpgYGANCjxicj4NCg0KIyMjIyAgQ3JlYXRlIFJlc3BvbnNlIFBsb3RzIGZvciBhbGwgQmluYXJ5IE9wdGlvbnMNCmBgYHtyfQ0KdHJhaW5fZGYgPC0gYXMuZGF0YS5mcmFtZSh0cmFpbikNCnRyYWluX2RmX2RlZHVwZSA8LSB0cmFpbl9kZlssICFkdXBsaWNhdGVkKHQodHJhaW5fZGYpKV0gI3JlbW92ZSBkdXBsaWNhdGVkIGZpZWxkcyAuLi4gZnJvbSByYWRkYXJAS2FnZ2xlID0+IGh0dHBzOi8vd3d3LmthZ2dsZS5jb20vYy9tZXJjZWRlcy1iZW56LWdyZWVuZXItbWFudWZhY3R1cmluZy9kaXNjdXNzaW9uLzM0MDA2DQoNCmNvbHVtbl9uYW1lc19mb3JfYmluYXJ5X3Bsb3RzIDwtIGNvbG5hbWVzKHRyYWluX2RmX2RlZHVwZSlbMTE6bGVuZ3RoKGNvbG5hbWVzKHRyYWluX2RmX2RlZHVwZSkpXQ0KDQpsIDwtIGh0bWx0b29sczo6dGFnTGlzdCgpDQpmb3IgKGkgaW4gMTpsZW5ndGgoY29sdW1uX25hbWVzX2Zvcl9iaW5hcnlfcGxvdHMpKSB7ICAjDQogIHBsb3RfdmFyaWFibGUgPC0gY29sdW1uX25hbWVzX2Zvcl9iaW5hcnlfcGxvdHNbaV0NCiAgbFtbaV1dIDwtIHBsb3RfbHkoeD10cmFpbl9kZl9kZWR1cGVbLHBsb3RfdmFyaWFibGVdLHk9dHJhaW5fZGZfZGVkdXBlJHksIHR5cGU9ImJveCIsIGhlaWdodD0zMDApICU+JSBsYXlvdXQodGl0bGU9cGxvdF92YXJpYWJsZSkNCn0NCmwNCg0KYGBgDQo8YnI+DQo=